Code
# Load required libraries
# (deduplicated: the original pasted this entire block twice, and the
# second copy's final line was fused with the following prose)
library(tidyverse)
library(maps)
library(ggplot2)
library(gridExtra)
library(sf)
library(viridis)
library(scales)
library(dplyr)
library(randomForest)
library(caret)
library(pdp)
library(knitr)
library(kableExtra)

Data from Kroodsma et al. (2018) Science, accessible at: Global Fishing Watch Data Download Portal.
The data is accessible up to the end of the year 2020; we used four entire years of data (2017, 2018, 2019, and 2020) to match the SAR data records.
# Set the path to the 2017-2020 folder
#path <- "Data/AIS Fishing Effort 2017-2020"
# List all CSV files in the folder
#AIS_csv_files <- list.files(path = here::here(path), pattern = "*.csv", full.names = TRUE, recursive = TRUE)
# Read all CSV files and combine them into a single data frame
#AIS_fishing <- AIS_csv_files %>%
# map_df(~read_csv(.))
# Load the pre-assembled AIS data frame (object: AIS_fishing)
load(here::here("Data","AIS_fishing.Rdata"))

# Total fishing hours per native AIS grid cell, keeping only cells
# where some effort was actually recorded
aggregated_AIS_fishing <- AIS_fishing %>%
  group_by(cell_ll_lat, cell_ll_lon) %>%
  summarise(total_fishing_hours = sum(fishing_hours, na.rm = TRUE), .groups = "drop") %>%
  filter(total_fishing_hours > 0)
# Snap a coordinate pair down to the lower-left corner of its grid cell.
# floor() (rather than round()) keeps the lower-left convention
# consistent for negative longitudes/latitudes. Vectorized over inputs.
standardize_coords <- function(lon, lat) {
  cells_per_degree <- 10  # 0.1-degree resolution
  list(
    lon_std = floor(lon * cells_per_degree) / cells_per_degree,
    lat_std = floor(lat * cells_per_degree) / cells_per_degree
  )
}
# Standardize and aggregate AIS data to the 0.1-degree grid.
# floor() is already vectorized, so bin directly instead of building a
# per-row list with map2() and unpacking it with map_dbl() — same
# result, far less overhead on millions of rows.
AIS_data_std <- aggregated_AIS_fishing %>%
  mutate(
    lon_std = floor(cell_ll_lon * 10) / 10,
    lat_std = floor(cell_ll_lat * 10) / 10
  ) %>%
  group_by(lon_std, lat_std) %>%
  summarise(total_fishing_hours = sum(total_fishing_hours, na.rm = TRUE), .groups = "drop")
# Create the world map polygons
world_map <- map_data("world")

# Plot AIS fishing effort on the 0.1-degree grid.
# FIX(review): the basemap layer used `aes(long = long, lat = lat, ...)`;
# "long"/"lat" are not ggplot2 aesthetics (they trigger an "ignoring
# unknown aesthetics" warning), so they are mapped to x/y explicitly,
# matching the later plots in this document.
ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(x = long, y = lat, map_id = region),
           color = "black", fill = "lightgray", size = 0.1) +
  geom_tile(data = AIS_data_std,
            aes(x = lon_std, y = lat_std, fill = total_fishing_hours)) +
  scale_fill_viridis(
    option = "inferno",
    direction = -1,
    trans = "log1p",  # compress the heavy right tail of fishing hours
    name = "AIS Fishing Effort (hours; 2017-2020)",
    breaks = c(0, 1, 10, 100, 1000, 10000, 100000),
    labels = scales::comma,
    guide = guide_colorbar(barwidth = 20, barheight = 0.5, title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(title = NULL,
       x = "Longitude", y = "Latitude") +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )

Data from Paolo et al. 2024 Nature, accessible at: Global Fishing Watch SAR Vessel Detections
The data is accessible from 2017 to the present; we used four entire years of data (2017, 2018, 2019, and 2020) to match the AIS records.
# Set the path to the 2016 folder
#path <- "Data/SAR Vessel detections 2017-2020"
# List all CSV files in the folder
#SAR_csv_files <- list.files(path = here::here(path), pattern = "*.csv", full.names = TRUE)
# Read all CSV files and combine them into a single data frame
#SAR_fishing <- SAR_csv_files %>%
# map_df(~read_csv(.))
# Load the pre-assembled SAR data frame (object: SAR_fishing)
load(here::here("Data","SAR_fishing.Rdata"))

# Aggregate SAR detections by rounded latitude/longitude.
# FIX(review): the fishing_score filter is row-wise, so it is applied
# before group_by() — identical rows are kept, but filtering is no
# longer done group-by-group.
aggregated_SAR_fishing <- SAR_fishing %>%
  filter(fishing_score >= 0.9) %>%  # keep high-confidence fishing detections only
  mutate(
    lat_rounded = round(lat, digits = 2),
    lon_rounded = round(lon, digits = 2)
  ) %>%
  group_by(lat_rounded, lon_rounded) %>%
  summarise(
    total_presence_score = sum(presence_score, na.rm = TRUE),
    avg_fishing_score = mean(fishing_score, na.rm = TRUE),
    count = n(),
    .groups = "drop"
  ) %>%
  mutate(total_presence_score = round(total_presence_score, digits = 0))
# Standardize and aggregate SAR data to the 0.1-degree grid
# (vectorized floor() binning instead of per-row map2()/map_dbl() —
# same result, far less overhead).
SAR_data_std <- aggregated_SAR_fishing %>%
  mutate(
    lon_std = floor(lon_rounded * 10) / 10,
    lat_std = floor(lat_rounded * 10) / 10
  ) %>%
  group_by(lon_std, lat_std) %>%
  summarise(total_presence_score = sum(total_presence_score, na.rm = TRUE), .groups = "drop")
# Create the world map polygons
world_map <- map_data("world")

# Plot SAR-detected fishing vessels on the 0.1-degree grid.
# FIX(review): `long`/`lat` are not ggplot2 aesthetic names; the
# original `aes(long = long, lat = lat, ...)` is ignored with a
# warning, so they are mapped explicitly to x/y here.
ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(x = long, y = lat, map_id = region),
           color = "black", fill = "lightgray", size = 0.1) +
  geom_tile(data = SAR_data_std,
            aes(x = lon_std, y = lat_std, fill = total_presence_score)) +
  scale_fill_viridis(
    option = "inferno",
    direction = -1,
    trans = "log1p",  # compress the heavy right tail of detection counts
    name = "Fishing vessel detections (2017-2020)",
    breaks = c(0, 1, 10, 100, 1000, 10000),
    labels = scales::comma,
    guide = guide_colorbar(barwidth = 20, barheight = 0.5, title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(title = NULL,
       x = "Longitude", y = "Latitude") +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )

Identifying grid cells with only AIS, only SAR detections or both data types.
# Merge the AIS and SAR grids (full join keeps cells present in either
# source) and label each cell by which source(s) detected fishing there.
combined_data <- AIS_data_std %>%
  full_join(SAR_data_std, by = c("lon_std", "lat_std")) %>%
  mutate(
    category = case_when(
      total_fishing_hours > 0 & total_presence_score > 0 ~ "Both AIS and SAR",
      total_fishing_hours > 0 & (is.na(total_presence_score) | total_presence_score == 0) ~ "Only AIS",
      (is.na(total_fishing_hours) | total_fishing_hours == 0) & total_presence_score > 0 ~ "Only SAR",
      TRUE ~ "No fishing detected"
    )
  )
# Create the world map polygons
world_map <- map_data("world")

# Map each grid cell by its data-source category.
# FIX(review): basemap aesthetics mapped explicitly to x/y — the
# original used `long =`/`lat =`, which are not ggplot2 aesthetics and
# are ignored with a warning.
world_plot <- ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(x = long, y = lat, map_id = region),
           color = "black", fill = "lightgray", size = 0.1) +
  geom_tile(data = combined_data,
            aes(x = lon_std, y = lat_std, fill = category)) +
  scale_fill_manual(
    values = c("Both AIS and SAR" = "purple",
               "Only AIS" = "blue",
               "Only SAR" = "red",
               "No fishing detected" = "white"),
    name = "Fishing Data Source",
    guide = guide_legend(title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(title = "Global Fishing Detection",
       subtitle = "Comparison of AIS (2017-2020) and SAR (2017-2020) data at 0.1-degree resolution",
       x = "Longitude", y = "Latitude") +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )
# Display the plot
print(world_plot)

# Get summary statistics
# Count grid cells per category and express each count as a share of
# the total (rounded to two decimals for display)
summary_stats <- combined_data %>%
  count(category, name = "Number of cells") %>%
  mutate(percentage = round(`Number of cells` / sum(`Number of cells`) * 100, 2))
# Render the summary as a styled HTML table
kable(summary_stats,
      format = "html",
      col.names = c("Category", "Number of cells", "Percentage (%)"),
      caption = "Summary Statistics of Data Categories") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE,
                position = "center") %>%
  column_spec(2, width = "150px") %>%
  column_spec(3, width = "150px")

| Category | Number of cells | Percentage (%) |
|---|---|---|
| Both AIS and SAR | 163095 | 9.12 |
| Only AIS | 1566190 | 87.60 |
| Only SAR | 58668 | 3.28 |
Predicting fishing hours in areas with only SAR data at a 0.1 degree resolution. Using a random forest model of >160,000 associations between SAR vessel detections and AIS fishing hours globally.
#---- Open the AIS data
#load(here::here("Data", "AIS_fishing.Rdata"))
# Total fishing hours per native AIS grid cell, dropping cells where
# no effort was recorded
aggregated_AIS_fishing <- AIS_fishing %>%
  group_by(cell_ll_lat, cell_ll_lon) %>%
  summarise(total_fishing_hours = sum(fishing_hours, na.rm = TRUE), .groups = "drop") %>%
  filter(total_fishing_hours > 0)
#---- Open the SAR data
#load(here::here("Data", "SAR_fishing.Rdata"))
# Aggregate SAR detections by rounded coordinates.
# FIX(review): the row-wise fishing_score filter is applied before
# group_by() — identical rows are kept, but without per-group filtering.
aggregated_SAR_fishing <- SAR_fishing %>%
  filter(fishing_score >= 0.9) %>%  # high-confidence fishing detections only
  mutate(
    lat_rounded = round(lat, digits = 2),
    lon_rounded = round(lon, digits = 2)
  ) %>%
  # filter(matched_category == "fishing") %>%
  group_by(lat_rounded, lon_rounded) %>%
  summarise(
    total_presence_score = sum(presence_score, na.rm = TRUE),
    avg_fishing_score = mean(fishing_score, na.rm = TRUE),
    count = n(),
    .groups = "drop"
  ) %>%
  mutate(total_presence_score = round(total_presence_score, digits = 0))
# Snap a coordinate pair down to the lower-left corner of its grid
# cell. floor() keeps the lower-left convention consistent for negative
# coordinates; the function is vectorized over its inputs.
standardize_coords <- function(lon, lat) {
  cells_per_degree <- 10  # 0.1-degree resolution
  list(
    lon_std = floor(lon * cells_per_degree) / cells_per_degree,
    lat_std = floor(lat * cells_per_degree) / cells_per_degree
  )
}
# Aggregate AIS data to 0.1 degree resolution.
# floor() is vectorized, so bin directly instead of the per-row
# purrr::map2() + sapply() round-trip (same result, much faster).
AIS_data_01deg <- aggregated_AIS_fishing %>%
  mutate(
    lon_std = floor(cell_ll_lon * 10) / 10,
    lat_std = floor(cell_ll_lat * 10) / 10
  ) %>%
  group_by(lon_std, lat_std) %>%
  summarise(total_fishing_hours = sum(total_fishing_hours, na.rm = TRUE), .groups = "drop")
# Aggregate SAR data to 0.1 degree resolution.
# Vectorized floor() binning replaces the per-row purrr::map2() +
# sapply() round-trip (same result, much faster).
SAR_data_01deg <- aggregated_SAR_fishing %>%
  mutate(
    lon_std = floor(lon_rounded * 10) / 10,
    lat_std = floor(lat_rounded * 10) / 10
  ) %>%
  group_by(lon_std, lat_std) %>%
  summarise(total_presence_score = sum(total_presence_score, na.rm = TRUE), .groups = "drop")
# Combine the two 0.1-degree grids and flag which sources are present
# in each cell
combined_data_01deg <- full_join(
  AIS_data_01deg,
  SAR_data_01deg,
  by = c("lon_std", "lat_std")
) %>%
  mutate(
    has_AIS = !is.na(total_fishing_hours) & total_fishing_hours > 0,
    has_SAR = !is.na(total_presence_score) & total_presence_score > 0
  )
# Cells observed by both sources train the model; cells with SAR only
# are the ones the model will fill in
training_data <- combined_data_01deg %>%
  filter(has_AIS, has_SAR) %>%
  select(total_fishing_hours, total_presence_score, lon_std, lat_std)
prediction_data <- combined_data_01deg %>%
  filter(!has_AIS, has_SAR) %>%
  select(total_presence_score, lon_std, lat_std)
# Train the random forest model with timing
#set.seed(123) # for reproducibility
#rf_timing <- system.time({
# rf_model_01deg <- randomForest(
# total_fishing_hours ~ total_presence_score + lon_std + lat_std,
# data = training_data,
# ntree = 500,
# importance = TRUE
# )
#})
# Print the time taken
#print(paste("Random Forest model training time:", rf_timing["elapsed"], "seconds"))
#save(rf_model_01deg,file="rf_model_01deg.Rdata")
# Loads the fitted model object `rf_model_01deg` (see the save() call above)
load(here::here("rf_model_01deg.Rdata"))
#Visualise results
# BUG FIX(review): predictions were made with an undefined `rf_model`;
# the object restored from rf_model_01deg.Rdata is `rf_model_01deg`.
# (An unused `predictions <- predict(...)` intermediate was removed.)
# Fill each cell: observed AIS hours where available, model predictions
# for SAR-only cells, zero elsewhere.
# NOTE(review): case_when() evaluates every right-hand side on ALL rows,
# so predict() also sees rows with NA predictors — confirm that this
# model tolerates NAs, or predict on the SAR-only subset instead.
combined_data_01deg <- combined_data_01deg %>%
  mutate(
    predicted_fishing_hours = case_when(
      has_AIS ~ total_fishing_hours,
      has_SAR ~ predict(rf_model_01deg, newdata = select(., total_presence_score, lon_std, lat_std)),
      TRUE ~ 0
    )
  )
# Create the world map polygons
world_map <- map_data("world")

#Map of predicted fishing hours only
# Prepare the data for the map.
# FIX(review): this data frame was named `map_data`, shadowing the
# ggplot2/maps function map_data() called above — renamed to avoid the
# name clash.
sar_only_predictions <- combined_data_01deg %>%
  filter(!has_AIS & has_SAR) %>%
  select(lon_std, lat_std, predicted_fishing_hours)
# Create the map
predicted_SAR_only_plot <- ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(x = long, y = lat, map_id = region),
           color = "darkgray", fill = "lightgray", size = 0.1) +
  geom_tile(data = sar_only_predictions,
            aes(x = lon_std, y = lat_std, fill = predicted_fishing_hours)) +
  scale_fill_viridis(
    option = "inferno",
    direction = -1,
    trans = "log1p",  # compress the heavy right tail of predicted hours
    name = "Predicted fishing hours (2017-2020)",
    breaks = c(0, 1, 10, 100, 1000, 10000, 100000, 1000000),
    labels = scales::comma,
    guide = guide_colorbar(barwidth = 20, barheight = 0.5, title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(title = "Predicted Fishing Hours in Areas with Only SAR Detections",
       subtitle = "0.1 degree resolution",
       x = "Longitude", y = "Latitude") +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )
# Print the map
print(predicted_SAR_only_plot)

#Map of both original and predicted AIS fishing hours
# Combined map: observed AIS hours plus model-filled SAR-only cells
predicted_plot <- ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(x = long, y = lat, map_id = region),
           color = "black", fill = "lightgray", size = 0.1) +
  geom_tile(data = combined_data_01deg,
            aes(x = lon_std, y = lat_std, fill = predicted_fishing_hours)) +
  scale_fill_viridis(
    option = "inferno",
    direction = -1,
    trans = "log1p",  # log-like scale keeps low-effort cells visible
    name = "AIS Fishing Effort (hours; 2017-2020)",
    breaks = c(0, 1, 10, 100, 1000, 10000, 100000, 1000000),
    labels = scales::comma,
    guide = guide_colorbar(barwidth = 20, barheight = 0.5,
                           title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(
    title = "Global Fishing Hours (0.1 degree resolution)",
    subtitle = "Based on AIS data and Random Forest predictions from SAR data",
    x = "Longitude",
    y = "Latitude"
  ) +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )
print(predicted_plot)

#Aggregate data to 1 degree resolution
# Round the 0.1-degree cell coordinates to the nearest whole degree
combined_data_1deg <- combined_data_01deg %>%
  mutate(
    lon_1deg = round(lon_std),
    lat_1deg = round(lat_std)
  )
# Sum predicted fishing hours within each 1-degree cell
aggregated_data <- combined_data_1deg %>%
  group_by(lon_1deg, lat_1deg) %>%
  summarise(
    predicted_fishing_hours = sum(predicted_fishing_hours, na.rm = TRUE),
    .groups = "drop"
  )
# Create the world map polygons
world_map <- map_data("world")
# Create the map
predicted_plot_1deg <- ggplot() +
geom_map(data = world_map, map = world_map,
aes(long, lat, map_id = region),
color = "black", fill = "lightgray", size = 0.1) +
geom_tile(data = aggregated_data,
aes(x = lon_1deg, y = lat_1deg, fill = predicted_fishing_hours)) +
scale_fill_viridis(
option = "inferno",
direction = -1,
trans = "log1p",
name = "AIS Fishing Effort (hours; 2017-2020)",
breaks = c(0, 1, 10, 100, 1000, 10000, 100000, 1000000),
labels = scales::comma,
guide = guide_colorbar(barwidth = 20, barheight = 0.5, title.position = "top", title.hjust = 0.5)
) +
coord_fixed(1.3) +
theme_minimal() +
labs(title = "Global Fishing Hours (1 degree resolution)",
subtitle = "Based on AIS data and Random Forest predictions from SAR data made at 0.1 degree resolution",
x = "Longitude", y = "Latitude") +
theme(
legend.position = "bottom",
legend.direction = "horizontal",
legend.box = "vertical",
legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
legend.title = element_text(margin = ggplot2::margin(b = 10))
)
# Print the map
print(predicted_plot_1deg)# Evaluate the model
# Evaluate a fitted randomForest regression model against the observed
# total_fishing_hours column in `data`.
#
# model      : randomForest object predicting total_fishing_hours
# data       : data frame containing the model's predictors and the
#              observed target column total_fishing_hours
# log_target : set TRUE if the model was trained on log10(y + 1);
#              predictions are then back-transformed with 10^pred - 1
#
# Returns a named list of error metrics, residual summaries, and the
# model's feature-importance matrix.
evaluate_model <- function(model, data, log_target = FALSE) {
  predictions <- predict(model, newdata = data)
  if (log_target) {
    predictions <- 10^predictions - 1
  }
  actual <- data$total_fishing_hours
  # Basic Error Metrics
  mae <- mean(abs(actual - predictions), na.rm = TRUE)
  rmse <- sqrt(mean((actual - predictions)^2, na.rm = TRUE))
  # MAPE is undefined for actual == 0; upstream filtering keeps hours > 0
  mape <- mean(abs((actual - predictions) / actual) * 100, na.rm = TRUE)
  medae <- median(abs(actual - predictions), na.rm = TRUE)
  # Feature Importance (computed early so the predictor count can reuse it)
  feature_importance <- importance(model)
  # R-squared and Adjusted R-squared
  # FIX(review): mean(actual) also needs na.rm, otherwise a single NA
  # makes ss_total NA despite the na.rm on sum().
  ss_total <- sum((actual - mean(actual, na.rm = TRUE))^2, na.rm = TRUE)
  ss_residual <- sum((actual - predictions)^2, na.rm = TRUE)
  r_squared <- 1 - (ss_residual / ss_total)
  n <- length(actual)
  # BUG FIX(review): `model$forest$independent.variable.names` is a
  # ranger slot, not a randomForest one — it was NULL here, giving p = 0
  # and collapsing adjusted R-squared onto plain R-squared. The
  # importance matrix has one row per predictor, so count from it.
  p <- nrow(feature_importance) # Number of predictors
  adj_r_squared <- 1 - ((1 - r_squared) * (n - 1) / (n - p - 1))
  # Residual Analysis
  residuals <- actual - predictions
  mean_residual <- mean(residuals, na.rm = TRUE)
  sd_residual <- sd(residuals, na.rm = TRUE)
  return(list(
    "Mean Absolute Error" = mae, # Mean Absolute Error
    "Root Mean Squared Error" = rmse, # Root Mean Squared Error
    "Mean Absolute Percentage Error" = mape, # Mean Absolute Percentage Error
    "Median Absolute Error" = medae, # Median Absolute Error
    "R-Squared" = r_squared, # R-Squared (Coefficient of Determination)
    "Adjusted R-Squared" = adj_r_squared, # Adjusted R-Squared
    "Mean of Residuals" = mean_residual, # Mean of Residuals
    "Standard Deviation of Residuals" = sd_residual, # Standard Deviation of Residuals
    "Feature Importance" = feature_importance # Feature Importance
  ))
}
# Merge the datasets and label each cell by data availability
combined_data_01deg <- full_join(
  AIS_data_01deg,
  SAR_data_01deg,
  by = c("lon_std", "lat_std")
) %>%
  mutate(
    has_AIS = !is.na(total_fishing_hours) & total_fishing_hours > 0,
    has_SAR = !is.na(total_presence_score) & total_presence_score > 0,
    data_category = case_when(
      has_AIS & has_SAR ~ "Both AIS and SAR",
      has_AIS & !has_SAR ~ "Only AIS",
      !has_AIS & has_SAR ~ "Only SAR",
      TRUE ~ "No fishing detected"
    )
  )
# Evaluate on cells where both sources are observed.
# NOTE(review): these are in-sample metrics — the model was trained on
# these same cells; a held-out split would give honest error estimates.
validation_data <- combined_data_01deg %>% filter(data_category == "Both AIS and SAR")
# BUG FIX(review): the fitted object loaded from rf_model_01deg.Rdata is
# `rf_model_01deg`; `rf_model` was never defined in this document.
results_no_transform <- evaluate_model(rf_model_01deg, validation_data)
# Create a dataframe for the table
results_table <- data.frame(
Metric = c("Mean Absolute Error", "Root Mean Squared Error", "Mean Absolute Percentage Error",
"Median Absolute Error", "R-Squared", "Adjusted R-Squared",
"Mean of Residuals", "Standard Deviation of Residuals"),
Value = c(results_no_transform$`Mean Absolute Error`,
results_no_transform$`Root Mean Squared Error`,
results_no_transform$`Mean Absolute Percentage Error`,
results_no_transform$`Median Absolute Error`,
results_no_transform$`R-Squared`,
results_no_transform$`Adjusted R-Squared`,
results_no_transform$`Mean of Residuals`,
results_no_transform$`Standard Deviation of Residuals`)
)
# Create and display the table
kable(results_table, format = "html", digits = 4, caption = "Model Evaluation Metrics") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
full_width = FALSE, position = "center")| Metric | Value |
|---|---|
| Mean Absolute Error | 275.0480 |
| Root Mean Squared Error | 1177.0435 |
| Mean Absolute Percentage Error | 1965.4999 |
| Median Absolute Error | 55.1906 |
| R-Squared | 0.9166 |
| Adjusted R-Squared | 0.9166 |
| Mean of Residuals | -0.1617 |
| Standard Deviation of Residuals | 1177.0471 |
# For feature importance, create a separate table
feature_importance <- as.data.frame(results_no_transform$`Feature Importance`)
feature_importance$Feature <- rownames(feature_importance)
feature_importance <- feature_importance[, c("Feature", "%IncMSE", "IncNodePurity")]
colnames(feature_importance) <- c("Feature", "%IncMSE", "IncNodePurity")
# Sort the feature importance table by %IncMSE in descending order
feature_importance <- feature_importance[order(-feature_importance$`%IncMSE`), ]
# BUG FIX(review): the data frame keeps the predictor names as row
# names, so kable() rendered them as an extra first column, duplicating
# the Feature column (visible in the output table below).
# row.names = FALSE suppresses the duplicate.
kable(feature_importance, format = "html", digits = 4, row.names = FALSE,
      caption = "Feature Importance") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"),
                full_width = FALSE, position = "center") %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(2:3, width = "150px")

| Feature | %IncMSE | IncNodePurity |
|---|---|---|
| total_presence_score | 98.8369 | 987162725907 |
| lon_std | 43.6550 | 791743926811 |
| lat_std | 38.6756 | 696033312224 |
Model evaluation interpretation:
Strengths: The model explains a large portion of the variance in the data (high R² and Adjusted R²), and the median error (MedAE) is reasonably low, suggesting that the model is accurate for many of the predictions.
Weaknesses: The high RMSE, MAPE, and SD of residuals indicate the presence of some large errors and possible outliers that are skewing the results. The very high MAPE is particularly concerning, suggesting that the model may struggle with predictions for certain instances, possibly due to the nature of the data.
Feature Importance: The model relies heavily on the total_presence_score (total vessel detections) feature, with geographical location features also playing significant roles.
To improve the model, consider investigating the outliers and errors, potentially refining the model or using alternative modeling approaches that might handle these cases better.